{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "e6e4bdc1",
   "metadata": {},
   "source": [
    "# Using the DuckDB R API\n",
    "\n",
    "## Technical requirements \n",
    "\n",
    "In order to run the examples in this notebook, you'll need to install the R dependencies for this project. You can do this by running the following command in your R session. \n",
    "\n",
    "    install.packages(c(\"duckdb\", \"tidyverse\", \"arrow\")) \n",
    "\n",
    "For complete instructions on how to set up your environment for working through the examples, please consult the *Technical requirements* section of this chapter in the book.\n",
    "\n",
    "## Working with DuckDB using R’s DBI "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d456e62a",
   "metadata": {},
   "outputs": [],
   "source": [
    "library(\"DBI\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4cabdc52",
   "metadata": {},
   "source": [
    "### Connecting to DuckDB"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "227fa53a",
   "metadata": {},
   "outputs": [],
   "source": [
    "disk_conn <- dbConnect(\n",
    "    duckdb::duckdb(),\n",
    "    dbdir = \"quack.duckdb\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e000f05b",
   "metadata": {},
   "outputs": [],
   "source": [
    "read_only_conn <- dbConnect(\n",
    "    duckdb::duckdb(),\n",
    "    dbdir = \"quack.duckdb\",\n",
    "    read_only = TRUE\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ca486a58",
   "metadata": {},
   "outputs": [],
   "source": [
    "mem_conn <- dbConnect(\n",
    "    duckdb::duckdb(),\n",
    "    dbdir = \":memory:\"\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6ee8d122",
   "metadata": {},
   "source": [
    "### Reading and writing tables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bcaccae3-004c-4dc7-b8cb-155ab08486ab",
   "metadata": {},
   "outputs": [],
   "source": [
    "library(tidyverse) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "667f950a",
   "metadata": {},
   "outputs": [],
   "source": [
    "dogs_df <- read_csv(\n",
    "    file = \"NYC_Dog_Licensing_Dataset.csv\",\n",
    "    col_types = cols(\n",
    "        LicenseIssuedDate = col_date(\"%m/%d/%Y\"),\n",
    "        LicenseExpiredDate = col_date(\"%m/%d/%Y\"),\n",
    "        AnimalGender = col_factor(levels = c(\"M\", \"F\")),\n",
    "    ),\n",
    ")\n",
    "\n",
    "head(dogs_df, 5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a5302bd9",
   "metadata": {},
   "outputs": [],
   "source": [
    "conn <- dbConnect(\n",
    "    duckdb::duckdb(),\n",
    "    dbdir = \"nyc_dogs.duckdb\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "606a2ac6-c911-46f7-9e3d-e9d475e36c06",
   "metadata": {},
   "outputs": [],
   "source": [
    "dbWriteTable(\n",
    "    conn,\n",
    "    \"dogs_table_from_df\",\n",
    "    dogs_df,\n",
    "    overwrite = TRUE\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c45f8f35",
   "metadata": {},
   "outputs": [],
   "source": [
    "dbListTables(conn)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4fe29305",
   "metadata": {},
   "outputs": [],
   "source": [
    "dbListFields(conn, \"dogs_table_from_df\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "48e4c447",
   "metadata": {},
   "outputs": [],
   "source": [
    "dogs_from_duckdb_df <- dbReadTable(conn, \"dogs_table_from_df\")\n",
    "\n",
    "tail(dogs_from_duckdb_df, 5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c74b91c3",
   "metadata": {},
   "outputs": [],
   "source": [
    "dbCreateTable(conn, \"empty_dogs_table\", dogs_df) \n",
    "\n",
    "dbReadTable(conn, \"empty_dogs_table\") "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ad75457d",
   "metadata": {},
   "outputs": [],
   "source": [
    "dbCreateTable( \n",
    "    conn,\n",
    "    \"good_dogs_table\",\n",
    "    c(name = \"character\", birthday = \"date\", age = \"double\")\n",
    ") \n",
    "\n",
    "dbListTables(conn)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "937db46f",
   "metadata": {},
   "outputs": [],
   "source": [
    "dbDataType(conn, \"Monty\")\n",
    "\n",
    "dbDataType(conn, 6)\n",
    "\n",
    "dbDataType(conn, today())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e0d4509f",
   "metadata": {},
   "outputs": [],
   "source": [
    "dbRemoveTable(conn, \"good_dogs_table\") \n",
    "\n",
    "dbListTables(conn)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "24bd5699",
   "metadata": {},
   "source": [
    "### Querying and executing SQL statements"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c2cb978d",
   "metadata": {},
   "outputs": [],
   "source": [
    "dbGetQuery(\n",
    "    conn,\n",
    "    \"\n",
    "    SELECT ZipCode,\n",
    "        count(*) AS num_registrations\n",
    "    FROM dogs_table_from_df\n",
    "    GROUP BY ZipCode\n",
    "    ORDER BY num_registrations DESC\n",
    "    LIMIT 10\n",
    "    \"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9a0202cf",
   "metadata": {},
   "outputs": [],
   "source": [
    "dbExecute(\n",
    "    conn,\n",
    "    \" \n",
    "    CREATE OR REPLACE VIEW nyc_dogs_csv_view AS\n",
    "    SELECT *\n",
    "    FROM read_csv(\n",
    "            'NYC_Dog_Licensing_Dataset.csv',\n",
    "            ignore_errors=True\n",
    "        )\n",
    "    \"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "90bfbdd7",
   "metadata": {},
   "outputs": [],
   "source": [
    "dbGetQuery(\n",
    "    conn,\n",
    "    \" \n",
    "    SELECT *\n",
    "    FROM nyc_dogs_csv_view\n",
    "    USING SAMPLE 3\n",
    "    \"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d93d1cf1",
   "metadata": {},
   "outputs": [],
   "source": [
    "dogs_df1 = dbGetQuery(conn, \"SELECT * FROM nyc_dogs_csv_view\")\n",
    "\n",
    "dogs_df2 = dbReadTable(conn, \"nyc_dogs_csv_view\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "10a9ad54",
   "metadata": {},
   "source": [
    "### Using Prepared statements  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3bf3fee0",
   "metadata": {},
   "outputs": [],
   "source": [
    "dbGetQuery(\n",
    "    conn,\n",
    "    \"\n",
    "    SELECT *\n",
    "    FROM nyc_dogs_csv_view\n",
    "    WHERE BreedName = ? AND AnimalGender = ?\n",
    "    LIMIT ?\n",
    "    \",\n",
    "    list(\"Beagle\", \"M\", 3)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e6aede18",
   "metadata": {},
   "outputs": [],
   "source": [
    "result <- dbSendStatement(conn, \"INSERT INTO dogs_table_from_df VALUES (?, ?, ?, ?, ?, ?, ?, ?)\")\n",
    "\n",
    "dbBind(result, list(\"John\", \"M\", 2013, \"Jack Russell Terrier\", 10261, \"2014-07-12\", \"2017-08-09\", 2016))\n",
    "\n",
    "dbBind(result, list(\"Lady Fluffina\", \"F\", 2014, \"Bichon Frisé\", 10302, \"2014-08-22\", \"2017-09-25\", 2016))\n",
    "\n",
    "dbClearResult(result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "032937a0",
   "metadata": {},
   "outputs": [],
   "source": [
    "tail(dbReadTable(conn, \"dogs_table_from_df\"), 2)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bec623a9",
   "metadata": {},
   "source": [
    "### Disconnecting from DuckDB  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d88dacba",
   "metadata": {},
   "outputs": [],
   "source": [
    "dbDisconnect(conn, shutdown = TRUE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "88b7c78a",
   "metadata": {},
   "outputs": [],
   "source": [
    "run_query <- function(db_path, sql_query) {\n",
    "    conn <- dbConnect(duckdb::duckdb(), dbdir = db_path)\n",
    "    on.exit(dbDisconnect(conn, shutdown = TRUE), add = TRUE)\n",
    "    dbGetQuery(conn, sql_query)\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "982e6a91",
   "metadata": {},
   "outputs": [],
   "source": [
    "run_query(\n",
    "    \"nyc_dogs.duckdb\",\n",
    "    \"SELECT * FROM dogs_table_from_df USING SAMPLE 3\"\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3b6db966",
   "metadata": {},
   "source": [
    "## Registering R objects as virtual tables "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "38a70466-6060-48c9-a6b1-d8efb6906152",
   "metadata": {},
   "outputs": [],
   "source": [
    "conn <- dbConnect(\n",
    "    duckdb::duckdb(),\n",
    "    dbdir = \"nyc_dogs.duckdb\"\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9bda5c39-a9a5-4b07-82c6-89047a685ad2",
   "metadata": {},
   "source": [
    "### Registering a dataframe as a virtual table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "24b5d14d",
   "metadata": {},
   "outputs": [],
   "source": [
    "duckdb::duckdb_register(conn, \"dogs_df_view\", dogs_df)\n",
    "\n",
    "dbListTables(conn)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "966ee1df",
   "metadata": {},
   "outputs": [],
   "source": [
    "dbGetQuery(\n",
    "    conn, \n",
    "    \"SELECT * FROM dogs_df_view USING SAMPLE 3\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "57d42d34",
   "metadata": {},
   "outputs": [],
   "source": [
    "duckdb::duckdb_unregister(conn, \"dogs_df_view\")\n",
    "\n",
    "dbListTables(conn)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0a1433ab",
   "metadata": {},
   "source": [
    "### Registering an Arrow table as a virtual table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b25ef284",
   "metadata": {},
   "outputs": [],
   "source": [
    "library(arrow) \n",
    "\n",
    "dogs_arrow = arrow::arrow_table(dogs_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f112000b",
   "metadata": {},
   "outputs": [],
   "source": [
    "duckdb::duckdb_register_arrow(conn, \"dogs_arrow_view\", dogs_arrow)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "75ad7392",
   "metadata": {},
   "outputs": [],
   "source": [
    "dbGetQuery(\n",
    "    conn, \n",
    "    \"SELECT * FROM dogs_arrow_view USING SAMPLE 3\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "92a2c604",
   "metadata": {},
   "outputs": [],
   "source": [
    "duckdb::duckdb_list_arrow(conn)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "71fbfba6",
   "metadata": {},
   "outputs": [],
   "source": [
    "duckdb::duckdb_unregister_arrow(conn, \"dogs_arrow_view\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9025a556",
   "metadata": {},
   "source": [
    "## Using DuckDB with `dplyr`\n",
    "\n",
    "### Using `dplyr` to query dataframes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "158f77bf-3747-44f6-9f2f-47ced6acdc81",
   "metadata": {},
   "outputs": [],
   "source": [
    "dogs_df |>\n",
    "    group_by(LicenseIssuedYear = year(LicenseIssuedDate)) |>\n",
    "    summarise(count = n()) |>\n",
    "    arrange(desc(count))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a20fe9d6",
   "metadata": {},
   "outputs": [],
   "source": [
    "dogs_df |>\n",
    "    count(\n",
    "        LicenseIssuedYear = year(LicenseIssuedDate),\n",
    "        name = \"count\",\n",
    "        sort = TRUE\n",
    "    )"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8a79fe13",
   "metadata": {},
   "source": [
    "### Using `dplyr` to query DuckDB tables via `dbplyr`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "13c616e5",
   "metadata": {},
   "outputs": [],
   "source": [
    "tbl(conn, \"nyc_dogs_csv_view\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ba95f595",
   "metadata": {},
   "outputs": [],
   "source": [
    "tbl(conn, \"nyc_dogs_csv_view\") |>\n",
    "    head(3) |> \n",
    "    collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "507f8c66",
   "metadata": {},
   "outputs": [],
   "source": [
    "tbl(conn, \"nyc_dogs_csv_view\") |>\n",
    "    head(3) |>\n",
    "    show_query()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "617a05d7",
   "metadata": {},
   "source": [
    "### Data wrangling with `dplyr`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "97055e23",
   "metadata": {},
   "outputs": [],
   "source": [
    "unique_dogs <- tbl(conn, \"nyc_dogs_csv_view\") |>\n",
    "    filter(!AnimalName %in% c(\"UNKNOWN\", \"NAME NOT PROVIDED\", \"NAME\", \"NONE\")) |>\n",
    "    filter(!BreedName %in% c(\"Unknown\", \"Not Provided\")) |>\n",
    "    distinct(AnimalName, AnimalGender, AnimalBirthYear, BreedName, ZipCode)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e31377c7",
   "metadata": {},
   "outputs": [],
   "source": [
    "unique_dogs |>\n",
    "    head(5) |>\n",
    "    collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "722a6b48",
   "metadata": {},
   "outputs": [],
   "source": [
    "table_query <- unique_dogs |>\n",
    "    filter(AnimalGender == \"M\") |>\n",
    "    count(AnimalName, name = \"num_dogs\", sort = TRUE) |>\n",
    "    head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f9de9d17",
   "metadata": {},
   "outputs": [],
   "source": [
    "table_query |>\n",
    "    show_query()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "89bb230c",
   "metadata": {},
   "outputs": [],
   "source": [
    "table_query |>\n",
    "    collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f7a9d94f",
   "metadata": {},
   "outputs": [],
   "source": [
    "pop_dog_names <- unique_dogs |>\n",
    "    filter(AnimalGender == \"F\", AnimalBirthYear > 2010) |>\n",
    "    count(AnimalBirthYear, AnimalName, name = \"NumDogs\") |>\n",
    "    slice_max(by = AnimalBirthYear, order_by = NumDogs) |>\n",
    "    arrange(AnimalBirthYear)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ec40fddd",
   "metadata": {},
   "outputs": [],
   "source": [
    "pop_dog_names |>\n",
    "    collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2b7af748",
   "metadata": {},
   "outputs": [],
   "source": [
    "issued_by_year = tbl(conn, \"nyc_dogs_csv_view\") |>\n",
    "    count(\n",
    "        LicenseIssuedYear = year(LicenseIssuedDate),\n",
    "        name = \"Count\",\n",
    "        sort = TRUE\n",
    "    )\n",
    "\n",
    "issued_by_year |>\n",
    "    collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dd96def7",
   "metadata": {},
   "outputs": [],
   "source": [
    "issued_by_year |> \n",
    "    show_query()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dd32f6b3",
   "metadata": {},
   "outputs": [],
   "source": [
    "unique_dogs |>\n",
    "    mutate(Hash = md5(concat(AnimalName, ZipCode))) |>\n",
    "    head(3) |>\n",
    "    collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b78bfc89",
   "metadata": {},
   "outputs": [],
   "source": [
    "unique_dogs |>\n",
    "    count(AnimalName, name = \"Count\") |>\n",
    "    filter(Count >= 100) |>\n",
    "    mutate(EditDistance=round(jaro_winkler_similarity(AnimalName, \"BELLA\"), 3)) |>\n",
    "    arrange(desc(EditDistance)) |>\n",
    "    head(10) |>\n",
    "    collect()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "78a6db41",
   "metadata": {},
   "source": [
    "## Summary\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "52184319-5920-42c7-a5d7-4eec53d9e036",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "jupytext": {
   "cell_metadata_filter": "-all",
   "main_language": "R",
   "notebook_metadata_filter": "-all"
  },
  "kernelspec": {
   "display_name": "R",
   "language": "R",
   "name": "ir"
  },
  "language_info": {
   "codemirror_mode": "r",
   "file_extension": ".r",
   "mimetype": "text/x-r-source",
   "name": "R",
   "pygments_lexer": "r",
   "version": "4.3.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
